library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(data.table)
## Warning: package 'data.table' was built under R version 3.2.5
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, last
library(tidyr)
library(knitr)
library(ggplot2)
library(ggrepel)
library(broom)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
# Load the pre-built analysis table and the Yelp business records.
combinewithoutuser <- read_csv("combinewithoutuser_complete.csv")
business <- readRDS("yelp_academic_dataset_business.rds")

# Keywords that mark a business as food-related. They are OR-ed into one
# case-sensitive regular expression and matched against `categories`.
# Fix: the first keyword used to be misspelled "Restautrant", which cannot
# match any category text containing "Restaurant".
food_keywords <- c(
  "Restaurant", "Food", "Breakfast", "Bars", "food", "Burgers", "Brunch",
  "Sandwiches", "Pubs", "Chinese", "Italian", "American", "Pizza", "Coffee",
  "Tea", "Fast Food", "Asian", "Fusion", "Lounges", "Cafes", "Irish", "Gluten",
  "Salad", "Diners", "Seafood", "Bakeries", "Desserts", "Japanese",
  "Ice Cream & Frozen Yogurt", "Tapas/Small Plates", "Mediterranean",
  "Wine Bars", "Vegetarian", "Portuguese", "German", "Delis", "Chicken Wings",
  "Hot Dogs", "Polish", "Greek", "Sushi Bars", "Indian", "Mexican", "Bagels",
  "Donuts", "Tapas Bars", "Cocktail Bars", "Ethnic Food", "Middle Eastern",
  "Steakhouses", "Cafeteria", "Candy Stores", "Korean", "Chocolatiers & Shops",
  "Cheese Shops", "Vietnamese", "Thai", "Tea Rooms", "Latin American",
  "Creperies", "French", "Taiwanese", "Buffets", "Cajun/Creole", "Soul Food",
  "Juice Bars & Smoothies", "Fondue", "Ethiopian", "Persian/Iranian",
  "Popcorn Shops", "Spanish", "Cheesesteaks", "Fish & Chips", "British",
  "Kosher", "Armenian", "Cupcakes", "Vegan", "Hawaiian", "Cuban", "Gastropubs",
  "Russian", "Pretzels", "Fruits & Veggies", "Gelato", "Halal", "Dim Sum",
  "Filipino", "Pasta Shops", "Mongolian", "Colombian", "Cantonese",
  "Street Vendors", "Belgian", "Cambodian", "Hungarian", "Szechuan",
  "Bubble Tea", "Laotian", "African", "Beer Bar", "Himalayan/Nepalese",
  "Moroccan", "Falafel", "Indonesian", "Turkish", "Afghan", "Food Stands",
  "Modern European", "Irish Pub", "Brazilian", "Food Court", "Malaysian",
  "Coffeeshops", "Hot Pot", "Burmese", "Macarons", "Ramen", "Empanadas",
  "Bistros", "Teppanyaki", "Brasseries", "Singaporean", "Champagne Bars",
  "Scandinavian", "Canadian", "Poutineries", "Haitian", "Arabian", "Austrian",
  "Czech", "Slovakian", "Bangladeshi", "Egyptian", "Dominican", "Scottish",
  "Patisserie/Cake Shop", "Pub Food", "Puerto Rican", "Australian",
  "Ukrainian", "Sri Lankan", "Beer Garden", "International", "Beer Gardens",
  "Serbo Croatian", "Kebab", "Alsatian", "Oriental", "Shanghainese",
  "Venezuelan", "Bavarian", "Iberian", "Curry Sausage", "Rhinelandian",
  "Beer Hall", "Eastern European", "Wok", "Trinidadian", "Swiss Food", "Pita"
)
food_pattern <- paste(food_keywords, collapse = "|")

# Cities kept for the analysis.
study_cities <- c(
  "Pittsburgh", "Charlotte", "Urbana", "Champaign", "Phoenix", "Scottsdale",
  "Tempe", "Mesa", "Chandler", "Gilbert", "Glendale", "Las Vegas",
  "Henderson", "Madison"
)

# Keep businesses whose categories mention a food keyword and that are located
# in one of the study cities.
business_simple <- business %>%
  filter(grepl(food_pattern, categories)) %>%
  filter(city %in% study_cities)

# Manual correction: this one business is assigned to Nevada.
business_simple$state[business_simple$business_id == "g49oTp73Pk_WpOfQVtmcew"] <- "NV"
## Bar chart of how many rows fall at each `stars` value
ggplot(combinewithoutuser, aes(x = stars)) +
  geom_bar()

## Mean and variance of `stars` (compared to judge a Poisson fit: not Poisson)
combinewithoutuser %>%
  summarise(mean = mean(stars), var = var(stars))
## Source: local data frame [1 x 2]
## 
##      mean       var
##     (dbl)     (dbl)
## 1 3.54474 0.4090287
#library(gridExtra)
#grid.arrange(map_nv1,map_nv2, nrow=1)

#all_states <- map_data("state")
#ggplot() + geom_polygon( data=all_states, aes(x=long, y=lat, group = group),colour="white", fill="grey10" ) + geom_point(data=restaurant_loc, aes(longitude, latitude), col="red", cex=3) + geom_point(data=station_loc, aes(lon, lat), col="white", cex=2) + facet_wrap(~state)

Restaurant Locations

The longitude and latitude of each restaurant are used to map all restaurant locations in our dataset with R GIS (Geographic Information System) tools, here the RgoogleMaps package. Every restaurant is drawn as a dot of the same size, so dots in a busy area overlap and merge into a larger blob: the bigger the blob, the larger the number of restaurants in that area.

## Distribution of all restaurants in our dataset
## geom_bar(stat = "count") tallies the rows per category itself, so the data
## is passed to ggplot() directly instead of through a no-op group_by().

# Number of rows in `business_simple` per city.
ggplot(business_simple, aes(city)) +
  geom_bar(stat = "count", width = 0.8, fill = "darkolivegreen3")

# Number of rows in `business_simple` per state.
ggplot(business_simple, aes(state)) +
  geom_bar(stat = "count", width = 0.8, fill = "darkolivegreen3")

# Location tables read from CSV.
restaurant_loc <- read_csv("restaurant_loc.csv")
station_loc <- read_csv("station_loc.csv")

# Attach the restaurant locations to every row of the analysis table.
# The join key is spelled out so the join cannot silently start matching on
# additional columns if the two tables ever share more names; with an explicit
# key dplyr also no longer prints a "Joining by" message.
restaurant_loc <- left_join(combinewithoutuser, restaurant_loc, by = "business_id")

# Subset with more than 100 reviews.
restaurant_loc2 <- restaurant_loc %>%
  filter(review_count > 100)

library(RgoogleMaps)
## Warning: package 'RgoogleMaps' was built under R version 3.2.5
# Draw one red dot per row of `locations` on a static map.
#   locations: data frame with `latitude` and `longitude` columns.
#   cex:       dot size, forwarded to points().
# Returns the value of PlotOnStaticMap() unchanged.
plot_restaurant_map <- function(locations, cex = 0.7) {
  PlotOnStaticMap(
    lat = locations$latitude, lon = locations$longitude,
    zoom = 5, size = c(640, 640), TrueProj = TRUE,
    cex = cex, pch = 19, col = "red3", FUN = points, add = FALSE
  )
}

## All restaurants
plot_restaurant_map(restaurant_loc, cex = 1.4)

## One pair of maps per state, in the order NV, PA, NC, IL, AZ, WI:
## first all rows of that state, then only the rows from restaurant_loc2.
# NOTE(review): `state`, `state2` and `map_nv1` keep their original names so
# that the variables left behind after this section are the same as before;
# `map_nv1` therefore holds the most recently drawn map, not only Nevada's.
for (state_abbr in c("NV", "PA", "NC", "IL", "AZ", "WI")) {
  # Inside filter(), `state` resolves to the data column, not to the data
  # frame of the same name assigned below.
  state <- restaurant_loc %>% filter(state == state_abbr)
  map_nv1 <- plot_restaurant_map(state)

  state2 <- restaurant_loc2 %>% filter(state == state_abbr)
  map_nv1 <- plot_restaurant_map(state2)
}

Distribution of variables

## Type of restaurant
ggplot(combinewithoutuser, aes(type)) +
  geom_bar(stat = "count", width = 0.8, fill = "darkolivegreen3")

# Same counts on a log10 axis, one panel per state. facet_wrap() does the
# splitting by state, so no group_by(state) is needed before plotting.
ggplot(combinewithoutuser, aes(type)) +
  geom_bar(stat = "count", width = 0.8, fill = "darkolivegreen3") +
  scale_y_log10() +
  theme(axis.text.x = element_text(angle = 40, size = 10, hjust = 0.9, vjust = 0.9)) +
  facet_wrap(~state)

## City
ggplot(combinewithoutuser, aes(city)) +
  geom_bar(stat = "count", width = 0.8, fill = "darkolivegreen3")

## State
ggplot(combinewithoutuser, aes(state)) +
  geom_bar(stat = "count", width = 0.8, fill = "darkolivegreen3")

## Review_count
# The x aesthetic already defines one box per city/state, so the data is not
# grouped with group_by() before plotting.
ggplot(combinewithoutuser, aes(city, review_count, col = city)) +
  geom_boxplot() +
  ggtitle("Review_count, stratified by city")

ggplot(combinewithoutuser, aes(state, review_count, col = state)) +
  geom_boxplot() +
  ggtitle("Review_count, stratified by state")

## Review_count < 1000
# Rows with review_count of 1000 or more are excluded here. The titles name
# the filter so these plots can be told apart from the unfiltered ones above.
combinewithoutuser %>%
  filter(review_count < 1000) %>%
  ggplot(aes(city, review_count, col = city)) +
  geom_boxplot() +
  ggtitle("Review_count < 1000, stratified by city")

combinewithoutuser %>%
  filter(review_count < 1000) %>%
  ggplot(aes(state, review_count, col = state)) +
  geom_boxplot() +
  ggtitle("Review_count < 1000, stratified by state")

## Stars
# These plots show `stars` on the y axis; the titles previously said
# "Review_count" (copied from the review_count plots) and are corrected here.
ggplot(combinewithoutuser, aes(city, stars, col = city)) +
  geom_boxplot() +
  ggtitle("Stars, stratified by city")

ggplot(combinewithoutuser, aes(state, stars, col = state)) +
  geom_boxplot() +
  ggtitle("Stars, stratified by state")

## Stars vs review_count
# One box per `stars` value. The `group = stars` aesthetic does the grouping,
# so no group_by(stars) is needed before plotting. The y axis is log10-scaled.
ggplot(combinewithoutuser, aes(stars, review_count, col = stars, group = stars)) +
  geom_boxplot() +
  scale_y_log10() +
  ggtitle("Review_count, stratified by stars") +
  ylab("Review_count (log10 scale)")

Distribution of binary attributes

# Working table for the attribute summaries: identifier, type, location,
# review count, stars and the attribute columns, in this fixed order (later
# code refers to the resulting rows by position).
test <- combinewithoutuser %>% select(business_id,type,city,state,review_count,stars,takeout:goodforbreakfast,alcohol,price:noise,wifi)

## Overall averages of attributes
# summary() on a data frame gives a character table with one column per
# variable; row 4 is taken here (the "Mean" entry for numeric columns).
r_sum <- summary(test)
r_sum_df <- as.data.frame(r_sum[4,])
colnames(r_sum_df) <- "Average"
# Only characters 9-16 of each cell are kept — presumably to strip the
# leading "Mean   :" label so that just the number remains (TODO confirm).
# NOTE(review): the resulting `Average` column is text, not a number.
r_s <- as.data.frame(cbind (rownames(r_sum_df), substr(r_sum_df$Average, 9,16)))
names(r_s) <- c("Attribute", "Average")
# Drop the first four rows (business_id, type, city, state) and print the rest.
r_s[-1:-4,] %>% kable
Attribute Average
5 review_count 113.3
6 stars 3.545
7 takeout 0.9421
8 reservation 0.3368
9 outdoorseating 0.4512
10 waiterservice 0.6105
11 creditcards 0.9818
12 goodforkids 0.8323
13 goodforgroups 0.9197
14 goodfordessert 0.03297
15 goodforlatenight 0.06838
16 goodforlunch 0.4446
17 goodfordinner 0.3071
18 goodforbrunch 0.09984
19 goodforbreakfast 0.1117
20 alcohol 0.5349
21 price 1.594
22 parking 0.9392
23 noise 0.9197
24 wifi 0.4032
## plotting of binary attributes
# `Average` in r_s is text cut out of the printed summary() table. Plotted
# as-is, ggplot puts it on a discrete axis and each bar's length shows only the
# rank of its label, not its value. Convert it to a number first.
r_s_num <- r_s
r_s_num$Average <- as.numeric(as.character(r_s_num$Average))

# Rows 7-20 hold the binary attributes (takeout through alcohol).
r_s_binary <- r_s_num[7:20, ]
# These averages are at most 1 (presumably 0/1 coded attributes), so the value
# axis runs from 0 to 1 and the red reference line marks 1.
r_s_binary %>%
  ggplot(aes(Attribute, Average)) +
  geom_bar(stat = "identity", position = "stack", width = 0.9, fill = "darkseagreen") +
  coord_flip(xlim = c(0, 15), ylim = c(0, 1)) +
  geom_text(aes(label = Average), hjust = 1, color = "white", size = 3.5) +
  theme_minimal() +
  geom_hline(yintercept = 1, color = "red")

## plotting of categorical attributes
# Rows 21-24 hold price, parking, noise and wifi.
r_s_cat <- r_s_num[21:24, ]
# Reference mark for each of the four attributes, in row order — presumably
# the highest level each attribute can take (TODO confirm).
r_s_cat$hline <- c(4, 2, 3, 2)
r_s_cat %>%
  ggplot(aes(Attribute, Average)) +
  geom_bar(stat = "identity", position = "stack", width = 0.9, fill = "darkseagreen") +
  coord_flip(xlim = c(0, 5), ylim = c(0, max(r_s_cat$hline))) +
  # ymin == ymax draws a single tick at the reference value of each bar.
  geom_errorbar(aes(ymax = hline, ymin = hline), colour = "#AA0000") +
  geom_text(aes(label = Average), hjust = 1, color = "white", size = 3.5) +
  theme_minimal()

## Averages of attributes by state
# Same summary as the overall table, restricted to rows with state "PA".
test_pa <- test %>% filter(state=="PA")
# summary() on a data frame gives a character table with one column per
# variable; row 4 is taken here (the "Mean" entry for numeric columns).
r_sum <- summary(test_pa)
r_sum_df <- as.data.frame(r_sum[4,])
colnames(r_sum_df) <- "Average"
# Only characters 9-16 of each cell are kept — presumably to strip the
# leading "Mean   :" label so that just the number remains (TODO confirm).
r_s <- as.data.frame(cbind (rownames(r_sum_df), substr(r_sum_df$Average, 9,16)))
names(r_s) <- c("Attribute", "Average")
# Drop the four non-numeric rows (business_id, type, city, state), exactly as
# the overall table does. Dropping only three rows left a meaningless
# "state NA" row at the top of this table.
r_s[-1:-4,] %>% kable
Attribute Average
5 review_count 71.90
6 stars 3.63
7 takeout 0.9282
8 reservation 0.3942
9 outdoorseating 0.3625
10 waiterservice 0.7032
11 creditcards 0.9161
12 goodforkids 0.7153
13 goodforgroups 0.8504
14 goodfordessert 0.0146
15 goodforlatenight 0.07178
16 goodforlunch 0.4173
17 goodfordinner 0.4282
18 goodforbrunch 0.07664
19 goodforbreakfast 0.07543
20 alcohol 0.4939
21 price 1.687
22 parking 1.464
23 noise 0.9294
24 wifi 0.3771
## plotting of binary attributes
# `Average` in r_s is text cut out of the printed summary() table. Plotted
# as-is, ggplot puts it on a discrete axis and each bar's length shows only the
# rank of its label, not its value. Convert it to a number first.
r_s_num <- r_s
r_s_num$Average <- as.numeric(as.character(r_s_num$Average))

# Rows 7-20 hold the binary attributes (takeout through alcohol).
r_s_binary <- r_s_num[7:20, ]
# These averages are at most 1 (presumably 0/1 coded attributes), so the value
# axis runs from 0 to 1 and the red reference line marks 1.
r_s_binary %>%
  ggplot(aes(Attribute, Average)) +
  geom_bar(stat = "identity", position = "stack", width = 0.9, fill = "darkseagreen") +
  coord_flip(xlim = c(0, 15), ylim = c(0, 1)) +
  geom_text(aes(label = Average), hjust = 1, color = "white", size = 3.5) +
  theme_minimal() +
  geom_hline(yintercept = 1, color = "red")

## plotting of categorical attributes
# Rows 21-24 hold price, parking, noise and wifi.
r_s_cat <- r_s_num[21:24, ]
# Reference mark for each of the four attributes, in row order — presumably
# the highest level each attribute can take (TODO confirm).
r_s_cat$hline <- c(4, 2, 3, 2)
r_s_cat %>%
  ggplot(aes(Attribute, Average)) +
  geom_bar(stat = "identity", position = "stack", width = 0.9, fill = "darkseagreen") +
  coord_flip(xlim = c(0, 5), ylim = c(0, max(r_s_cat$hline))) +
  # ymin == ymax draws a single tick at the reference value of each bar.
  geom_errorbar(aes(ymax = hline, ymin = hline), colour = "#AA0000") +
  geom_text(aes(label = Average), hjust = 1, color = "white", size = 3.5) +
  theme_minimal()

Distribution of binary attributes by state

# Working table: identifier, type, location, review count, stars and the
# attribute columns.
test <- select(
  combinewithoutuser,
  business_id, type, city, state, review_count, stars,
  takeout:goodforbreakfast, alcohol, price:noise, wifi
)

# Per-state mean of every column from takeout through alcohol.
r_sum_bin <- test %>%
  group_by(state) %>%
  select(state, takeout:alcohol) %>%
  summarise_each(funs(mean))

# Long format: one row per (state, attribute) pair; show the first rows.
mdat <- melt(r_sum_bin, id.vars = "state")
head(mdat)
##   state variable     value
## 1    AZ  takeout 0.9674077
## 2    IL  takeout 0.9636364
## 3    NC  takeout 0.9597378
## 4    NV  takeout 0.9095637
## 5    PA  takeout 0.9282238
## 6    WI  takeout 0.9081197
# Side-by-side bars of the per-state averages, one cluster per attribute, with
# a red reference line at 1. All theme settings are collected in one theme()
# call: tilted x labels, black panel, no plot background, major grid or border.
ggplot(mdat, aes(x = variable, y = value, fill = state)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_hline(yintercept = 1, col = "red", cex = 0.5) +
  scale_fill_brewer() +
  ggtitle("Average attributes by state") +
  theme(
    axis.text.x = element_text(angle = 40, size = 10, hjust = 0.9, vjust = 0.9),
    panel.background = element_rect(fill = "black"),
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank()
  )

Distribution of categorical variables

# Per-state mean of stars and of every column from price through wifi.
r_sum_cat <- test %>%
  group_by(state) %>%
  select(state, stars, price:wifi) %>%
  summarise_each(funs(mean))

# Long format: one row per (state, variable) pair; show the first rows.
mdat <- melt(r_sum_cat, id.vars = "state")
head(mdat)
##   state variable    value
## 1    AZ    stars 3.551263
## 2    IL    stars 3.454545
## 3    NC    stars 3.541199
## 4    NV    stars 3.516325
## 5    PA    stars 3.630170
## 6    WI    stars 3.580128
# Side-by-side bars of the per-state averages, one cluster per variable.
# All theme settings are collected in one theme() call: tilted x labels, black
# panel, no plot background, major grid or border.
ggplot(mdat, aes(x = variable, y = value, fill = state)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_brewer() +
  ggtitle("Average attributes by state") +
  theme(
    axis.text.x = element_text(angle = 40, size = 10, hjust = 0.9, vjust = 0.9),
    panel.background = element_rect(fill = "black"),
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank()
  )

# Histograms of stars (17 bins) and of price, parking, noise and wifi
# (10 bins each) over all rows of `test`.
ggplot(test, aes(x = stars)) +
  geom_histogram(stat = "bin", bins = 17, fill = "lightskyblue")

ggplot(test, aes(x = price)) +
  geom_histogram(stat = "bin", bins = 10, fill = "lightskyblue")

ggplot(test, aes(x = parking)) +
  geom_histogram(stat = "bin", bins = 10, fill = "lightskyblue")

ggplot(test, aes(x = noise)) +
  geom_histogram(stat = "bin", bins = 10, fill = "lightskyblue")

ggplot(test, aes(x = wifi)) +
  geom_histogram(stat = "bin", bins = 10, fill = "lightskyblue")

Distribution of demographics

# Boxplot of one column of `data`, with one box (and colour) per state.
#   data:   data frame containing a `state` column and the column to plot.
#   column: name of the column to plot, given as a string.
# Returns the ggplot object.
# aes_string() lets the column be chosen by name while the axis label stays
# the plain column name. The x aesthetic already separates the states, so the
# data is not grouped with group_by() first.
plot_by_state <- function(data, column) {
  ggplot(data, aes_string(x = "state", y = column, col = "state")) +
    geom_boxplot()
}

# Columns to plot, in the order the plots appear.
demographic_columns <- c(
  "population_zip", "income_zip", "age_zip", "white",
  "Black.or.African.American", "American.Indian.and.Alaska.Native", "Asian",
  "Native.Hawaiian.and.Other.Pacific.Islander", "other", "LANDSQMI",
  "popdensity_zip", "education_zip"
)

# print() is required: ggplot objects are not auto-printed inside a loop.
for (column in demographic_columns) {
  print(plot_by_state(combinewithoutuser, column))
}